library(tidyverse)
library(data.table)
library(lubridate)
library(gridExtra)
library(plotly)
library(scales)
library(fBasics)
library(reshape)
library(ggrepel)
library(knitr)
library(forecast)
library(MLmetrics)
df<-read.csv('so2_no2_o3.csv')
temp<-df[,c('SO2','O3','NO2')]
na_df<-data.frame(percent=round(colSums(is.na(temp))/nrow(temp)*100))
na_df$poll <- rownames(na_df)
na_df$pollutant<-factor(na_df$poll, as.character(na_df$poll))
ggplot(na_df, aes(pollutant, percent,fill=pollutant))+
geom_bar(stat="identity") +scale_fill_manual(values = c("magenta3","goldenrod3","lightslateblue"))+
geom_text(data=na_df, aes(label=paste0(percent,"%"),
y=percent+0.7), size=4)+
labs(x = "Pollutants", y = "Percentage",
title = "Percentage of NAs in Three Pollutants")
summary(df[,c("SO2","O3","NO2")])
colnames(df)
melt_nyc <- melt(df,id.vars='datetime_local', measure.vars=c('SO2','O3','NO2'))
three_box_obs<-ggplot(na.omit(melt_nyc),aes(x=variable, y=value, color=variable)) +
geom_boxplot()+ coord_flip()+
scale_colour_manual(values=c("magenta3","goldenrod3","lightslateblue"))+
theme(legend.position="none")+scale_y_continuous(breaks = seq(0, 700, by = 50))+
labs(title="Distribution of pollutants",x='Pollutants',y='Parts per billion')
three_hist_obs<-ggplot(na.omit(melt_nyc),aes(x = value,fill=variable)) +
facet_wrap(~variable, nrow = 1) +
geom_histogram(binwidth=20)+
scale_fill_manual(values=c("magenta3","goldenrod3","lightslateblue"))+
theme(legend.position="none")+
labs(x='Parts per billion',y='Pollutants')
grid.arrange(three_box_obs,three_hist_obs,nrow=2)
max_pollutants_per_day<-df%>%
as.data.frame %>%
select('datetime_local','SO2','O3','NO2')%>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(SO2=max(SO2,na.rm=TRUE),
O3=max(O3,na.rm=TRUE),
NO2=max(NO2,na.rm=TRUE)
)
max_pollutants_df<-as.data.frame(max_pollutants_per_day)
melt_nyc_max_pollutants <- melt(max_pollutants_df,id.vars='date_ymd', measure.vars=c('SO2','O3','NO2'))
three_box_daily_max<-ggplot(melt_nyc_max_pollutants,aes(x=variable, y=value, color=variable)) +
geom_boxplot()+ coord_flip()+
scale_colour_manual(values=c("magenta3","goldenrod3","lightslateblue"))+
theme(legend.position="none")+scale_y_continuous(breaks = seq(0, 700, by = 50))+
labs(title="Distribution of pollutants on Monthly Average values",x='Pollutants',y='Parts per billion')
three_hist_daily_max<-ggplot(melt_nyc_max_pollutants,aes(x = value,fill=variable)) +
facet_wrap(~variable, nrow = 1) +
geom_histogram(binwidth=20)+
scale_fill_manual(values=c("magenta3","goldenrod3","lightslateblue"))+
theme(legend.position="none")+
labs(x='Parts per billion',y='Count')
grid.arrange(three_box_daily_max,three_hist_daily_max,nrow=2)
plot_pollutant<-function(pol,title,colour){
pol<-enquo(pol)
date_df<-df%>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_emission=max(!!pol,na.rm=TRUE))
plot1<-ggplot(data=date_df,aes(x=date_ymd,y=max_emission))+geom_line(color=colour)+
scale_x_date(breaks = seq(as.Date("1990-01-01"), as.Date("2017-12-31"), by="6 months"),labels = date_format("%b-%y"))+
xlab('Date')+ylab('Parts per billion')+ggtitle(paste('Daily Maximum Emission of',title))+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")
month_df<-date_df%>%
mutate(y_m_d=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m_d)%>%
summarise(avg_emission=mean(max_emission,na.rm=TRUE))
plot2<-ggplot(data=month_df,aes(x=as.Date(y_m_d, format = "%Y - %m - %d"),y=avg_emission))+geom_line(color=colour)+
scale_x_date(breaks = seq(as.Date("1990-01-01"), as.Date("2017-12-31"), by="6 months"), date_labels = "%b-%y")+
xlab('Date')+ylab('Parts per billion')+ggtitle(paste('Monthly Average Emission of',title))+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")
grid.arrange(plot1,plot2,nrow=2)
}
plot_pollutant(SO2,"Sulpher Dioxide","magenta3")
plot_pollutant(O3,"Ozone","goldenrod3")
plot_pollutant(NO2,"Nitrogen Dioxide","lightslateblue")
plot_scale_monthly <- function(pol,title,colour) {
pol <- enquo(pol)
daily_max<-df%>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_emission=max(!!pol,na.rm=TRUE))
month_avg<-daily_max%>%
mutate(y_m_d=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m_d)%>%
summarise(avg_emission=mean(max_emission,na.rm=TRUE))
ggplot(data=month_avg,aes(x=as.Date(y_m_d,format="%Y - %m - %d"),y=scale(avg_emission)))+ geom_line(color=colour)+
scale_x_date(breaks = seq(as.Date("1990-01-01"), as.Date("2017-12-31"), by="12 months"), date_labels = "%Y")+
xlab('')+ylab(title)+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")
}
p1<-plot_scale_monthly(SO2,"Sulpher Dioxide","magenta3")
p2<-plot_scale_monthly(O3,"Ozone","goldenrod3")
p3<-plot_scale_monthly(NO2,"Nitrogen Dioxide","lightslateblue")
grid.arrange(p1,p2,p3,nrow=3,top = "Monthly Averaged on Daily Maximum Values")
df$month<-month(as.POSIXct(df$datetime_local))
df$hour<-hour(as.POSIXct(df$datetime_local))
pol_each_month<-function(pol,title,colour){
pol=enquo(pol)
each_month_pol_df<-df%>%
select('month','datetime_local','SO2','O3','NO2')%>%
mutate(month=as.Date(paste0("1990-",df$month,"-01"),"%Y-%m-%d"))%>%
group_by(month)%>%
summarise(count=mean(!!pol,na.rm=TRUE))
ggplot(each_month_pol_df,aes(month,count))+geom_line(color=colour)+
scale_x_date(breaks = seq(as.Date("1990-01-01"), as.Date("1990-12-01"), by="1 month"), date_labels = "%b")+
xlab('')+ylab(title)
}
m1<-pol_each_month(SO2,"Sulpher Dioxide","magenta3")
m2<-pol_each_month(O3,"Ozone","goldenrod3")
m3<-pol_each_month(NO2,"Nitrogen Dioxide","lightslateblue")
grid.arrange(m1,m2,m3,nrow=3,top = "Mean for Each Month")
pol_each_month<-function(pol,title,colour){
pol=enquo(pol)
each_month_pol_df<-df%>%
select('month','datetime_local','SO2','O3','NO2')%>%
mutate(month=as.Date(paste0("1990-",df$month,"-01"),"%Y-%m-%d"))%>%
group_by(month)%>%
summarise(count=max(!!pol,na.rm=TRUE))
ggplot(each_month_pol_df,aes(month,count))+geom_line(color=colour)+
scale_x_date(breaks = seq(as.Date("1990-01-01"), as.Date("1990-12-01"), by="1 month"), date_labels = "%b")+
xlab('')+ylab(title)
}
m1<-pol_each_month(SO2,"Sulpher Dioxide","magenta3")
m2<-pol_each_month(O3,"Ozone","goldenrod3")
m3<-pol_each_month(NO2,"Nitrogen Dioxide","lightslateblue")
grid.arrange(m1,m2,m3,nrow=3,top = "Max for Each Month")
pol_each_hour<-function(pol,title,colour){
pol=enquo(pol)
each_hour_pol_df<-df%>%
select('hour','datetime_local','SO2','O3','NO2')%>%
group_by(hour)%>%
summarise(count=max(!!pol,na.rm=TRUE))
ggplot(each_hour_pol_df,aes(as.numeric(hour),count))+geom_line(color=colour)+
scale_x_continuous(breaks=seq(0,23,1), limits=c(0,23)) +
xlab('')+ylab(title)
}
h1<-pol_each_hour(SO2,"Sulpher Dioxide","magenta3")
h2<-pol_each_hour(O3,"Ozone","goldenrod3")
h3<-pol_each_hour(NO2,"Nitrogen Dioxide","lightslateblue")
grid.arrange(h1,h2,h3,nrow=3,top = "Max for Each Hour")
pol_each_hour<-function(pol,title,colour){
pol=enquo(pol)
each_hour_pol_df<-df%>%
select('hour','datetime_local','SO2','O3','NO2')%>%
group_by(hour)%>%
summarise(count=mean(!!pol,na.rm=TRUE))
ggplot(each_hour_pol_df,aes(as.numeric(hour),count))+geom_line(color=colour)+
scale_x_continuous(breaks=seq(0,23,1), limits=c(0,23)) +
xlab('')+ylab(title)
}
h1<-pol_each_hour(SO2,"Sulpher Dioxide","magenta3")
h2<-pol_each_hour(O3,"Ozone","goldenrod3")
h3<-pol_each_hour(NO2,"Nitrogen Dioxide","lightslateblue")
grid.arrange(h1,h2,h3,nrow=3,top = "Mean for Each Hour")
df$datetime_local<-as.Date(df$datetime_local)
df$date_local<-as.Date(df$date_local)
colnames(df)
df_train <- subset(df, datetime_local >= "1998-01-01" & datetime_local <= "2014-12-31")
df_test <- subset(df, datetime_local >= "2015-01-01")
df_test$Month_Yr <- format(as.Date(df_test$date_local), "%Y-%m")
df_final <- subset(df, datetime_local >= "1998-01-01")
dim(df_train)
dim(df_test)
pollutant<-df_train%>%
as.data.frame %>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(O3,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
#converting to time series
pol_ts<-ts(pol_flow$count,frequency = 12,start = c(1998,01))
#ggplot format of ts plot
autoplot(pol_ts,colour ="goldenrod3") +xlab('Date')+ylab('Count')+ggtitle('Time Series of Ozone Pollutant')
pol_ts_decompose<-decompose(pol_ts)
actual<-autoplot(pol_ts_decompose$x)+xlab("Year")+ylab("Count")+ggtitle("Actual time series of ozone")
seas<-autoplot(pol_ts_decompose$seasonal)+xlab("Year")+ylab("Count")+ggtitle("Seasonality time series of ozone")
tren<-autoplot(pol_ts_decompose$trend)+xlab("Year")+ylab("Count")+ggtitle("Trend time series of ozone")
grid.arrange(actual,seas,tren,ncol=1,top="Decomposition of Ozone time series")
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_arima<-auto.arima(train_ts)
train_pol_fit_arima
train_forecast_arima<-forecast(train_pol_fit_arima,36)
pollutant<-df_final%>%
as.data.frame %>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(O3,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow_final<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
pol_ts_final<-ts(pol_flow_final$count,frequency = 12,start = c(1998,01))
dim(pol_flow_final)
train_forecast_arima_df<-data.frame(train_forecast_arima)
train_forecast_arima_pt_forecast<-train_forecast_arima_df$Point.Forecast
forecast_train_arima<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train_arima<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train_arima,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train_arima,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(colors="Series")+ggtitle("Training Vs Testing plot for Ozone-ARIMA")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_arima_pt_forecast)
Box.test(train_forecast_arima$residuals, lag=36, type="Ljung-Box")
acf_var_arima<-acf(na.omit(train_forecast_arima$residuals),lag.max = 36,plot=FALSE)
autoplot(acf_var_arima)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_arima$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_arima<-auto.arima(final_ts)
final_forecast_arima<-forecast(final_pol_fit_arima,24,level=c(80,95))
autoplot(final_forecast_arima)+labs(x='Date',y='Parts per billion',title='Forecast of ozone pollutant for 2018 & 2019 - ARIMA')
final_forecast_arima
final_forecast_arima
arima_forecast_df<-as.data.frame(final_forecast_arima)
colnames(arima_forecast_df)<-c('Forecast','Lo80','Hi80','Lo95','Hi95')
arima_forecast_df <- cbind('date' = rownames(arima_forecast_df), arima_forecast_df)
arima_forecast_df$date<-gsub(" ", "-", arima_forecast_df$date)
tst <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
arima_forecast_df$date<-(paste("01-", arima_forecast_df$date, sep = ""))
arima_forecast_df$date<-as.Date((arima_forecast_df$date),format="%d-%b-%Y")
#arima_forecast_df$date<-as.Date(arima_forecast_df$date,format= "%d-%b-%y")
arima_forecast_df
# Converting parts per billion to AQI for Ozone
arima_forecast_df$Forecast<-((arima_forecast_df$Forecast/1000)/0.08)*100
arima_forecast_df
arima_forecast_df$colour<- cut(arima_forecast_df$Forecast, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
after_pred<-ggplot(arima_forecast_df,aes(date,Forecast,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(Forecast)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2018-01-01"), as.Date("2019-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("Expected AQI level in 2018 & 2019 for Ozone")
# Converting parts per billion to AQI for Ozone
count_<-(pol_flow_final$count[217:240]/1000/0.08)*100
last_two_year<-data.frame("date"=pol_flow_final$date[217:240],"count"=count_)
last_two_year$colour<- cut(last_two_year$count, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
before_pred<-ggplot(last_two_year,aes(date,count,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(count)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2016-01-01"), as.Date("2017-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("AQI level in 2016 & 2017 for Ozone")
grid.arrange(before_pred,after_pred,nrow=2)
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_ets<-ets(train_ts)
train_pol_fit_ets
train_forecast_ets<-forecast:::forecast.ets(train_pol_fit_ets,36)
train_forecast_ets_df<-data.frame(train_forecast_ets)
train_forecast_ets_pt_forecast<-train_forecast_ets_df$Point.Forecast
forecast_train<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(color="Series")+ggtitle("Training Vs Testing plot for Ozone - ETS")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_ets_pt_forecast)
Box.test(train_forecast_ets$residuals, lag=36, type="Ljung-Box")
#ACF
acf_var_ets<-acf(na.omit(train_forecast_ets$residuals),plot=FALSE)
autoplot(acf_var_ets)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_ets$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_ets<-ets(final_ts)
final_forecast_ets<-forecast:::forecast.ets(final_pol_fit_ets,24)
autoplot(final_forecast_ets)+labs(x='Date',y='Parts per billion',title='Forecast of ozone pollutant for 2019 & 2020')
final_forecast_ets
pollutant<-df_train%>%
as.data.frame %>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(SO2,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
#converting to time series
pol_ts<-ts(pol_flow$count,frequency = 12,start = c(1998,01))
#ggplot format of ts plot
autoplot(pol_ts,colour ="magenta3") +xlab('Date')+ylab('Count')+ggtitle('Time Series of Sulpher Dioxide Pollutant')
pol_ts_decompose<-decompose(pol_ts)
actual<-autoplot(pol_ts_decompose$x)+xlab("Year")+ylab("Count")+ggtitle("Actual time series of Sulpher Dioxide")
seas<-autoplot(pol_ts_decompose$seasonal)+xlab("Year")+ylab("Count")+ggtitle("Seasonality time series of Sulpher Dioxide")
tren<-autoplot(pol_ts_decompose$trend)+xlab("Year")+ylab("Count")+ggtitle("Trend time series of Sulpher Dioxide")
grid.arrange(actual,seas,tren,ncol=1,top="Decomposition of Sulpher Dioxide time series")
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_arima<-auto.arima(train_ts)
train_pol_fit_arima
train_forecast_arima<-forecast(train_pol_fit_arima,36)
df_test$Month_Yr <- format(as.Date(df_test$date_local), "%Y-%m")
df_final <- subset(df, as.Date(date_local) >= "1998-01-01")
pollutant<-df_final%>%
as.data.frame %>%
mutate(date_ymd=as.Date(date_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(SO2,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow_final<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
pol_ts_final<-(ts(pol_flow_final$count,frequency = 12,start = c(1998,01)))
train_forecast_arima_df<-data.frame(train_forecast_arima)
train_forecast_arima_pt_forecast<-train_forecast_arima_df$Point.Forecast
forecast_train_arima<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train_arima<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train_arima,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train_arima,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(colors="Series")+ggtitle("Training Vs Testing plot for SO2 - ARIMA")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_arima_pt_forecast)
acf_var_arima<-acf(na.omit(train_forecast_arima$residuals),lag.max = 36,plot=FALSE)
autoplot(acf_var_arima)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_ets$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_arima<-auto.arima(final_ts)
final_forecast_arima<-forecast(final_pol_fit_arima,24,level=c(80,95))
autoplot(final_forecast_arima)+labs(x='Date',y='Parts per billion',title='Forecast of Sulpher Dioxide pollutant for 2018 & 2019')
final_forecast_arima
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_ets<-ets(train_ts)
train_pol_fit_ets
train_forecast_ets<-forecast:::forecast.ets(train_pol_fit_ets,36)
train_forecast_ets_df<-data.frame(train_forecast_ets)
train_forecast_ets_pt_forecast<-train_forecast_ets_df$Point.Forecast
forecast_train<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(color="Series")+ggtitle("Training Vs Testing plot for SO2 - ETS")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_ets_pt_forecast)
Box.test(train_forecast_ets$residuals, lag=36, type="Ljung-Box")
#ACF
acf_var_ets<-acf(na.omit(train_forecast_ets$residuals),plot=FALSE)
autoplot(acf_var_ets)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_ets$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_ets<-ets(final_ts)
final_forecast_ets<-forecast:::forecast.ets(final_pol_fit_ets,24)
autoplot(final_forecast_ets)+labs(x='Date',y='Parts per billion',title='Forecast of Sulpher Dioxide pollutant for 2018 & 2019 - ETS')
final_forecast_ets
ets_forecast_df<-as.data.frame(final_forecast_ets)
colnames(ets_forecast_df)<-c('Forecast','Lo80','Hi80','Lo95','Hi95')
ets_forecast_df <- cbind('date' = rownames(ets_forecast_df), ets_forecast_df)
ets_forecast_df$date<-gsub(" ", "-", ets_forecast_df$date)
tst <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
ets_forecast_df$date<-(paste("01-", ets_forecast_df$date, sep = ""))
ets_forecast_df$date<-as.Date((ets_forecast_df$date),format="%d-%b-%Y")
#ets_forecast_df$date<-as.Date(ets_forecast_df$date,format= "%d-%b-%y")
ets_forecast_df
# Converting parts per billion to AQI for SO2
ets_forecast_df$Forecast<-((ets_forecast_df$Forecast/1000)/0.14)*100
ets_forecast_df
ets_forecast_df$colour<- cut(ets_forecast_df$Forecast, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
after_pred<-ggplot(ets_forecast_df,aes(date,Forecast,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(Forecast)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2018-01-01"), as.Date("2019-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("Expected AQI level in 2018 & 2019 for SO2")
# Converting parts per billion to AQI for SO2
count_<-(((pol_flow_final$count[217:240])/1000)/0.14)*100
last_two_year<-data.frame("date"=pol_flow_final$date[217:240],"count"=count_)
last_two_year$colour<- cut(last_two_year$count, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
before_pred<-ggplot(last_two_year,aes(date,count,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(count)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2016-01-01"), as.Date("2017-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("AQI level in 2016 & 2017 for SO2")
grid.arrange(before_pred,after_pred,nrow=2)
pollutant<-df_train%>%
as.data.frame %>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(NO2,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
#converting to time series
pol_ts<-ts(pol_flow$count,frequency = 12,start = c(1998,01))
#ggplot format of ts plot
autoplot(pol_ts,colour ="lightslateblue") +xlab('Date')+ylab('Count')+ggtitle('Time Series of Nitrogen Dioxide Pollutant')
pol_ts_decompose<-decompose(pol_ts)
actual<-autoplot(pol_ts_decompose$x)+xlab("Year")+ylab("Count")+ggtitle("Actual time series of Nitrogen Dioxide")
seas<-autoplot(pol_ts_decompose$seasonal)+xlab("Year")+ylab("Count")+ggtitle("Seasonality time series of Nitrogen Dioxide")
tren<-autoplot(pol_ts_decompose$trend)+xlab("Year")+ylab("Count")+ggtitle("Trend time series of Nitrogen Dioxide")
grid.arrange(actual,seas,tren,ncol=1,top="Decomposition of Nitrogen Dioxide time series")
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_arima<-auto.arima(train_ts,approx=FALSE)
train_pol_fit_arima
train_forecast_arima<-forecast(train_pol_fit_arima,36)
pollutant<-df_final%>%
as.data.frame %>%
mutate(date_ymd=as.Date(datetime_local,format="%Y-%m-%d"))%>%
group_by(date_ymd)%>%
summarise(max_pol=max(NO2,na.rm=TRUE))%>%
as.data.frame %>%
mutate(y_m=paste(substr(date_ymd,1,nchar(date_ymd)+2),"-01"))%>%
group_by(y_m)%>%
summarise(avg_emission=mean(max_pol,na.rm=TRUE))
pol_flow_final<-data.frame("date"=as.Date(pollutant$y_m,format="%Y - %m - %d"),"count"=pollutant$avg_emission)
pol_ts_final<-ts(pol_flow_final$count,frequency = 12,start = c(1998,01))
train_forecast_arima_df<-data.frame(train_forecast_arima)
train_forecast_arima_pt_forecast<-train_forecast_arima_df$Point.Forecast
forecast_train_arima<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train_arima<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train_arima,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train_arima,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(colors="Series")+ggtitle("Training Vs Testing plot for NO2 - ARIMA")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_arima_pt_forecast)
Box.test(train_forecast_arima$residuals, lag=36, type="Ljung-Box")
acf_var_arima<-acf(na.omit(train_forecast_arima$residuals),lag.max = 36,plot=FALSE)
autoplot(acf_var_arima)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_arima$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_arima<-auto.arima(final_ts)
final_forecast_arima<-forecast(final_pol_fit_arima,24,level=c(80,95))
autoplot(final_forecast_arima)+labs(x='Date',y='Parts per billion',title='Forecast of Nitrogen Dioxide pollutant for 2018 & 2019 - ARIMA')
final_forecast_arima
train_ts<-ts(pol_ts,start = c(1998,01),end=c(2014,12),frequency = 12)
train_pol_fit_ets<-ets(train_ts)
train_pol_fit_ets
train_forecast_ets<-forecast:::forecast.ets(train_pol_fit_ets,36)
train_forecast_ets_df<-data.frame(train_forecast_ets)
train_forecast_ets_pt_forecast<-train_forecast_ets_df$Point.Forecast
forecast_train<-data.frame("x"=as.Date(pol_flow_final$date[205:240],format="%Y - %m - %d"),"y"=train_forecast_arima_pt_forecast)
actual_train<-data.frame("x"=as.Date(pol_flow_final$date,format="%Y - %m - %d"),"y"=pol_flow_final$count)
ggplot(forecast_train,aes(x,y))+geom_line(aes(color="First line")) +
geom_line(data = actual_train,aes(color="Second line"))+xlab("Year")+ylab("Count")+
labs(color="Series")+ggtitle("Training Vs Testing plot for NO2 - ETS")+
scale_colour_manual(values = c("red","green"),
labels=c("Forecast", "Actual"))
MAPE(pol_flow_final$count[205:240],train_forecast_ets_pt_forecast)
Box.test(train_forecast_ets$residuals, lag=36, type="Ljung-Box")
#ACF
acf_var_ets<-acf(na.omit(train_forecast_ets$residuals),plot=FALSE)
autoplot(acf_var_ets)+labs(x='Lag',y='ACF',title='ACF plot')
autoplot(train_forecast_ets$residuals)+labs(x='Date',y='Residuals',title='Residual plot')
final_ts<-ts(pol_ts_final,start = c(1998,01),frequency = 12)
final_pol_fit_ets<-ets(final_ts)
final_forecast_ets<-forecast:::forecast.ets(final_pol_fit_ets,24)
autoplot(final_forecast_ets)+labs(x='Date',y='Parts per billion',title='Forecast of Nitrogen Dioxide pollutant for 2018 & 2019')
final_forecast_ets
ets_forecast_df<-as.data.frame(final_forecast_ets)
colnames(ets_forecast_df)<-c('Forecast','Lo80','Hi80','Lo95','Hi95')
ets_forecast_df <- cbind('date' = rownames(ets_forecast_df), ets_forecast_df)
ets_forecast_df$date<-gsub(" ", "-", ets_forecast_df$date)
tst <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
ets_forecast_df$date<-(paste("01-", ets_forecast_df$date, sep = ""))
ets_forecast_df$date<-as.Date((ets_forecast_df$date),format="%d-%b-%Y")
#ets_forecast_df$date<-as.Date(ets_forecast_df$date,format= "%d-%b-%y")
ets_forecast_df
# Converting parts per billion to AQI for NO2
ets_forecast_df$Forecast<-((ets_forecast_df$Forecast/1000)/0.053)*100
ets_forecast_df
ets_forecast_df$colour<- cut(ets_forecast_df$Forecast, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
after_pred<-ggplot(ets_forecast_df,aes(date,Forecast,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(Forecast)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2018-01-01"), as.Date("2019-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("Expected AQI level in 2018 & 2019 for NO2")
count_=((pol_flow_final$count[217:240]/1000)/0.053)*100
last_two_year<-data.frame("date"=pol_flow_final$date[217:240],"count"=count_)
last_two_year$colour<- cut(last_two_year$count, breaks = c(-Inf,50,100,150,Inf),
labels = c("green", "yellow", "orange","red"))
before_pred<-ggplot(last_two_year,aes(date,count,fill=colour))+geom_bar(stat="identity",position="dodge")+
geom_text(aes(label = round(count)), position = position_dodge(width = 1),
vjust = 1.5, size = 3) +
scale_x_date(breaks = seq(as.Date("2016-01-01"), as.Date("2017-12-01"), by="1 month"), date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position="none")+scale_fill_identity()+
ggtitle("AQI level in 2016 & 2017 for NO2")
grid.arrange(before_pred,after_pred,nrow=2)